################################################################################ 
#
# The distribution of hate speech and its implications for content moderation
# PSRM - Replication package
# Create controls for lasso selection
#
################################################################################ 



################################################################################ 
#  LIBRARIES
################################################################################ 

library(dplyr)
library(tidyr)
library(readr)
library(fastDummies)

# Clear the workspace before running this step. Every object returned by ls()
# is removed except those whose name starts with "wd" (e.g. the wd_data and
# wd_data_processed paths used below) or is exactly "setsave".
# NOTE(review): this assumes the wd* path variables were defined by a calling
# script before this file is sourced -- confirm against the master script.
rm(list = setdiff(ls(), ls(pattern = "^wd|^setsave$")))

################################################################################ 
#   DATA AND FOLDER
################################################################################ 

# Read the raw file. The two ID columns are forced to character so they are
# carried through as text rather than parsed as numbers.
id_col_classes <- c(user_id = "character", status_id = "character")
data <- read.csv(
  paste0(wd_data, "/ch_twitter_data_upd.csv"),
  colClasses = id_col_classes
)

# Keep only rows where all four status flags equal 0. which() is used so that
# rows where the condition evaluates to NA are dropped instead of producing
# all-NA rows.
retained_rows <- which(
  data$attrition == 0 &
    data$not_auth_sofar == 0 &
    data$suspended_sofar == 0 &
    data$protected_sofar == 0
)
data <- data[retained_rows, ]

# Group creation year into three integer-coded bins:
# 1 = [2005, 2015], 2 = (2015, 2019], 3 = (2019, 2022].
year_breaks <- c(2005, 2015, 2019, 2022)
data$year_agg <- cut(
  as.numeric(data$creation_year),
  breaks = year_breaks,
  labels = FALSE,
  include.lowest = TRUE
)

################################################################################ 
#   VARIABLES
################################################################################

# Identifier columns.
ids <- c("user_id", "date")

# Continuous controls: binned at their sample quantiles and dummy-encoded below.
cont_cont <- c(
  "age",
  "favourites_count",
  "followers_count",
  "friends_count",
  "no_hate_tweets_pre_daily",
  "no_tweets_pre_daily",
  "pre_tweets_h_avg"
)

# Controls that enter the lasso matrix as they are (only cast to numeric).
# NOTE(review): presumably these are already 0/1 indicators -- confirm.
dummy_cont <- c(
  "description",
  "location",
  "is_newspapers",
  "sample_politically_interested",
  "is_politicians"
)

# Categorical controls: expanded into one dummy per level below.
cart_cont <- c("year_agg", "lang", "user_anonymity")

# All controls, in the order continuous, categorical, pass-through.
all_cont <- c(cont_cont, cart_cont, dummy_cont)

################################################################################ 
#   Bin continuous controls into quartiles and create dummies
################################################################################ 

# Bin every continuous control at its own sample quantiles. quantile() is
# called with its default probabilities (0, 25, 50, 75, 100%), so each control
# is split into quartile bins; labels = FALSE yields integer bin codes, which
# are stored as factors. Missing values stay NA.
temp <- as.data.frame(lapply(data[cont_cont], function(x) {
  as.factor(
    cut(x, quantile(x, na.rm = TRUE), labels = FALSE, include.lowest = TRUE)
  )
}))

# Expand each binned control into dummy columns and bind them side by side.
# The first column of the dummy_cols() result is dropped with [, -1] so that
# only the generated dummies are kept. drop = FALSE guarantees a data frame
# (with its column name) even when a single dummy column remains; without it
# the result collapses to an unnamed vector. Building the pieces with lapply()
# and binding once avoids the placeholder column and the repeated cbind()
# inside a loop.
cont_dummies <- do.call(
  cbind,
  lapply(cont_cont, function(v) dummy_cols(temp[v])[, -1, drop = FALSE])
)


################################################################################ 
#   Create dummies for categorical controls
################################################################################ 


# Expand each categorical control (cart_cont) into dummy columns and bind them
# side by side. The first column of the dummy_cols() result is dropped with
# [, -1] so that only the generated dummies are kept.
# drop = FALSE matters here: if a control has a single observed level, only
# one dummy column remains, and plain [, -1] would collapse it to an unnamed
# vector that cbind() then labels after the temporary variable instead of the
# control. Binding once via do.call() also removes the placeholder column and
# the repeated cbind() inside a loop.
cart_dummies <- do.call(
  cbind,
  lapply(cart_cont, function(v) dummy_cols(data[v])[, -1, drop = FALSE])
)


################################################################################ 
#   Lasso Dataset
################################################################################ 

# Base regressors to be interacted: the pass-through controls followed by the
# categorical dummies and the quantile-bin dummies. Every column is cast to
# numeric and the result is turned back into a data frame.
X <- cbind.data.frame(data[dummy_cont], cart_dummies, cont_dummies) %>%
  lapply(as.numeric) %>%
  as.data.frame()

# Build all first-order interactions of the columns of X, squares included.
# Each unordered pair is visited exactly once (j >= i), which reproduces the
# column order of a full double loop that skips already-seen pairs, without
# the repeated membership scan or growing the data frame one column at a time.
# A column is named "<a>_<b>", where a and b are the two variable names sorted
# in decreasing order.
x_names <- names(X)
n_vars <- length(x_names)
n_pairs <- n_vars * (n_vars + 1) / 2

pair_names <- character(n_pairs)
pair_values <- vector("list", n_pairs)
pos <- 0L
for (i in seq_len(n_vars)) {
  for (j in i:n_vars) {
    pos <- pos + 1L
    pair <- sort(c(x_names[i], x_names[j]), decreasing = TRUE)
    pair_names[pos] <- paste0(pair[1], "_", pair[2])
    pair_values[[pos]] <- X[[i]] * X[[j]]
  }
}

# If two different pairs ever produce the same name, keep the first one only.
first_use <- !duplicated(pair_names)

# Result: user_id followed by one column per interaction.
C <- data["user_id"]
if (any(first_use)) {
  C[pair_names[first_use]] <- pair_values[first_use]
}


final <- C

# Mean-impute missing values in the numeric columns only. The character
# user_id column is skipped: mean() on it is undefined and only raises a
# warning.
numeric_cols <- which(vapply(final, is.numeric, logical(1)))
for (i in numeric_cols) {
  missing_rows <- is.na(final[[i]])
  if (any(missing_rows)) {
    final[[i]][missing_rows] <- mean(final[[i]], na.rm = TRUE)
  }
}

# Exclude all constant variables. The variance is computed column by column on
# the numeric columns; apply() on the whole data frame would first coerce it
# to a character matrix because of user_id. Columns whose variance is NA
# (e.g. entirely missing) are dropped as well. Non-numeric columns (user_id)
# are always kept, and drop = FALSE keeps a data frame even if one column
# remains.
col_var <- vapply(final[numeric_cols], var, numeric(1))
keep <- rep(TRUE, ncol(final))
keep[numeric_cols] <- !is.na(col_var) & col_var != 0
final <- final[, keep, drop = FALSE]

write_rds(final, paste0(wd_data_processed, '/for_analysis/data_lasso.RDS'))
cat("\n====================\n")
cat("Created lasso data")
cat("\n====================\n")

################################################################################ 
#   Clean Dataset for Analysis
################################################################################ 


# Re-read the full raw file (no rows are dropped in this section). The ID
# columns are read as character, exactly as in the first read at the top of
# this script: parsed as numbers, long IDs would be stored as doubles, lose
# digits, and be written back rounded, no longer matching the user_id stored
# in the lasso dataset.
data <- read.csv(
  paste0(wd_data, "/ch_twitter_data_upd.csv"),
  colClasses = c(user_id = "character", status_id = "character")
)

# Remove columns that are no longer needed: every column whose name starts
# with one of these prefixes.
drop_prefixes <- c("protected.", "suspended.", "tweet_not_authorized.")
for (prefix in drop_prefixes) {
  data[names(data)[startsWith(names(data), prefix)]] <- NULL
}

# Create grouped treatments: the two empathy arms become "empathy", "ban" and
# "humor" become "alert", and any other label is kept unchanged.
data$treat_aggregate <- ifelse(
  data$treat_label %in% c("empathy_get", "empathy_take"),
  "empathy",
  ifelse(data$treat_label %in% c("ban", "humor"), "alert", data$treat_label)
)

write.csv(
  data,
  paste0(wd_data_processed, "/for_analysis/clean_for_analysis.csv"),
  row.names = FALSE
)
cat("\n====================\n")
cat("Created data for analysis")
cat("\n====================\n")